# NOTE: the former `rm(list = ls())` call was dropped on purpose. It only
# deletes the non-hidden objects of the current environment; attached
# packages, options and the working directory stay as they are, so it never
# produced a truly clean session, while it did wipe the workspace of anyone
# sourcing this file. Run the script in a fresh R session instead.

# load data set with keyword counts and crime numbers
dat <- read.csv("police_metadata.csv")

# logged ratio (right vs. left) of the restricted keyword counts on issue
# level; adding 0.5 to both counts keeps the logarithm finite for zero counts
dat[["kw_ratio_log"]] <- with(
  dat,
  log(count_right_kw_restricted + 0.5) - log(count_left_kw_restricted + 0.5)
)

# logged ratio (right vs. left) of the extreme crime numbers, same 0.5 offset
dat[["extreme_crime_ratio_log"]] <- with(
  dat,
  log(extreme_crime_right + 0.5) - log(extreme_crime_left + 0.5)
)

# impute missing values in crime ratio with federal level crime ratio

# federal ("bund") rows only; which() skips rows whose jurisdiction is NA
# instead of turning them into all-NA rows, as a bare logical index would
bund_dat <- dat[which(dat$jurisdiction == "bund"),
                c("year", "extreme_crime_right", "extreme_crime_left")]
bund_dat$extreme_crime_ratio_log_fed <-
  log(bund_dat$extreme_crime_right + 0.5) - log(bund_dat$extreme_crime_left + 0.5)
bund_dat <- bund_dat[, c("year", "extreme_crime_ratio_log_fed")]

# drop federal rows without a usable ratio BEFORE de-duplicating, so that the
# one row kept per year is the first that actually carries a value (otherwise
# an NA row could shadow a valid one and the imputation would silently fail)
bund_dat <- bund_dat[!is.na(bund_dat$extreme_crime_ratio_log_fed), ]
bund_dat <- bund_dat[!duplicated(bund_dat$year), ]

# left join: every row of dat is kept and receives the federal ratio of its
# year (NA if there is none); note that merge() may reorder the rows of dat
dat <- merge(dat, bund_dat, by = "year", all.x = TRUE)

# keep the row's own ratio where present, otherwise fall back to the federal one
dat$extreme_crime_ratio_log_imp <- ifelse(
  is.na(dat$extreme_crime_ratio_log),
  dat$extreme_crime_ratio_log_fed,
  dat$extreme_crime_ratio_log
)

# bias measure: logged keyword ratio minus the imputed logged crime ratio
dat[["bias_ratio_extreme_imp"]] <- with(
  dat,
  kw_ratio_log - extreme_crime_ratio_log_imp
)

# police unions as a factor with "gdp" listed first, which makes it the
# reference level; any value other than "gdp"/"dpolg" becomes NA
dat[["union"]] <- factor(dat[["union"]], levels = c("gdp", "dpolg"))

## regression models
## NOTE: the fixed effects below are section fixed effects. Sections are almost
## the same as jurisdictions, so for simplicity the paper refers to them as
## "jurisdiction fixed effects".

# position: logged keyword ratio as outcome
# (1) union only, (2) + section fixed effects, (3) + year fixed effects
mod_1 <- lm(
  formula = kw_ratio_log ~ factor(union),
  data = dat
)
mod_2 <- lm(
  formula = kw_ratio_log ~ factor(union) + factor(section),
  data = dat
)
mod_3 <- lm(
  formula = kw_ratio_log ~ factor(union) + factor(section) + factor(year),
  data = dat
)

# bias: keyword ratio net of the imputed crime ratio as outcome
# (4) union only, (5) + section fixed effects, (6) + year fixed effects
mod_4 <- lm(
  formula = bias_ratio_extreme_imp ~ factor(union),
  data = dat
)
mod_5 <- lm(
  formula = bias_ratio_extreme_imp ~ factor(union) + factor(section),
  data = dat
)
mod_6 <- lm(
  formula = bias_ratio_extreme_imp ~ factor(union) + factor(section) + factor(year),
  data = dat
)